# Gather parameters from command line
resultsPath <- params$resultsPath # resultsPath = file.path(getwd(),"Results")
nCores <- params$nCores #parallel::detectCores()
subsetGenes <- params$subsetGenes
subsetCells <- params$subsetCells
resolution <- as.numeric(params$resolution)
interactive <- params$interactive
perplexity <- params$perplexity
root <- "~/Desktop/PD_scRNAseq/"#getwd()
# Have to setwd via knitr
# knitr::opts_knit$set(root.dir=resultsPath, child.path = resultsPath)
knitr::opts_chunk$set(echo=T, error=T, root.dir = resultsPath
# cache=T, cache.lazy=T
)
# Utilize parallel processing later on
cat(paste("**** __Utilized Cores__ **** =", nCores)) ## **** __Utilized Cores__ **** = 2
## $subsetGenes
## [1] "protein_coding"
##
## $subsetCells
## [1] 500
##
## $resolution
## [1] 0.6
##
## $resultsPath
## [1] "./"
##
## $nCores
## [1] 2
##
## $perplexity
## [1] 30
./
NOTE:: When Seurat updated from v2 to v3, they made a lot of problematic changes to the function names and arguments that cause a ton of errors.
library(Seurat)
library(dplyr)
library(gridExtra)
library(knitr)
library(plotly)
library(ggplot2)
library(viridis)
library(reshape2)
library(shiny)
# library(ggrepel)
library(DT)
library(ComplexHeatmap); #BiocManager::install("ComplexHeatmap")
library(EnhancedVolcano); #BiocManager::install('EnhancedVolcano')
## Install Bioconductor
# if (!requireNamespace("BiocManager"))
# install.packages("BiocManager")
library(biomaRt) # BiocManager::install(c("biomaRt"))
library(DESeq2) # BiocManager::install(c("DESeq2"))
# library(enrichR) #BiocManager::install("enrichR")
library(monocle) #BiocManager::install("monocle")
# BiocManager::install("DelayedMatrixStats")
# BiocManager::install("org.Mm.eg.db")
# library(org.Hs.eg.db)
# library(garnett) # devtools::install_github("cole-trapnell-lab/garnett")
# Useful Seurat functions
## Seurat::MultiModal_CCA() # Integrates data from disparate datasets (CIA version too)
sessionInfo()## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 grid stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] monocle_2.99.3 L1Graph_0.1.1
## [3] lpSolveAPI_5.5.2.0-17.6 DDRTree_0.1.5
## [5] irlba_2.3.3 igraph_1.2.4.2
## [7] Matrix_1.2-18 DESeq2_1.24.0
## [9] SummarizedExperiment_1.14.1 DelayedArray_0.10.0
## [11] BiocParallel_1.18.1 matrixStats_0.55.0
## [13] Biobase_2.44.0 GenomicRanges_1.36.1
## [15] GenomeInfoDb_1.20.0 IRanges_2.18.3
## [17] S4Vectors_0.22.1 BiocGenerics_0.30.0
## [19] biomaRt_2.40.5 EnhancedVolcano_1.2.0
## [21] ggrepel_0.8.1 ComplexHeatmap_2.0.0
## [23] DT_0.12 shiny_1.4.0
## [25] reshape2_1.4.3 viridis_0.5.1
## [27] viridisLite_0.3.0 plotly_4.9.2
## [29] ggplot2_3.3.0 knitr_1.28
## [31] gridExtra_2.3 dplyr_0.8.4
## [33] Seurat_3.1.4
##
## loaded via a namespace (and not attached):
## [1] coda_0.19-3 tidyr_1.0.2 acepack_1.4.1
## [4] bit64_0.9-7 multcomp_1.4-12 data.table_1.12.8
## [7] rpart_4.1-15 RCurl_1.98-1.1 doParallel_1.0.15
## [10] metap_1.3 cowplot_1.0.0 TH.data_1.0-10
## [13] RSQLite_2.2.0 RANN_2.6.1 VGAM_1.1-2
## [16] future_1.16.0 bit_1.1-15.2 mutoss_0.1-12
## [19] webshot_0.5.2 httpuv_1.5.2 assertthat_0.2.1
## [22] xfun_0.12 hms_0.5.3 evaluate_0.14
## [25] promises_1.1.0 progress_1.2.2 caTools_1.18.0
## [28] DBI_1.1.0 geneplotter_1.62.0 htmlwidgets_1.5.1
## [31] sparsesvd_0.2 spdep_1.1-3 purrr_0.3.3
## [34] crosstalk_1.0.0 backports_1.1.5 annotate_1.62.0
## [37] gbRd_0.4-11 deldir_0.1-25 RcppParallel_4.4.4
## [40] vctrs_0.2.3 ROCR_1.0-7 withr_2.1.2
## [43] checkmate_2.0.0 sctransform_0.2.1 prettyunits_1.1.1
## [46] mnormt_1.5-6 cluster_2.1.0 ape_5.3
## [49] lazyeval_0.2.2 crayon_1.3.4 genefilter_1.66.0
## [52] units_0.6-5 glmnet_3.0-2 pkgconfig_2.0.3
## [55] slam_0.1-47 nlme_3.1-145 nnet_7.3-13
## [58] rlang_0.4.5 globals_0.12.5 lifecycle_0.2.0
## [61] miniUI_0.1.1.1 sandwich_2.5-1 rsvd_1.0.3
## [64] lmtest_0.9-37 raster_3.0-12 boot_1.3-24
## [67] zoo_1.8-7 base64enc_0.1-3 ggridges_0.5.2
## [70] GlobalOptions_0.1.1 pheatmap_1.0.12 png_0.1-7
## [73] rjson_0.2.20 bitops_1.0-6 KernSmooth_2.23-16
## [76] blob_1.2.1 rgl_0.100.50 classInt_0.4-2
## [79] shape_1.4.4 stringr_1.4.0 manipulateWidget_0.10.1
## [82] jpeg_0.1-8.1 scales_1.1.0 memoise_1.1.0
## [85] magrittr_1.5 plyr_1.8.6 ica_1.0-2
## [88] gplots_3.0.3 bibtex_0.4.2.2 gdata_2.18.0
## [91] zlibbioc_1.30.0 compiler_3.6.2 HSMMSingleCell_1.4.0
## [94] lsei_1.2-0 RColorBrewer_1.1-2 plotrix_3.7-7
## [97] clue_0.3-57 fitdistrplus_1.0-14 LearnBayes_2.15.1
## [100] XVector_0.24.0 listenv_0.8.0 patchwork_1.0.0
## [103] pbapply_1.4-2 htmlTable_1.13.3 Formula_1.2-3
## [106] MASS_7.3-51.5 tidyselect_1.0.0 stringi_1.4.6
## [109] densityClust_0.3 yaml_2.2.1 locfit_1.5-9.1
## [112] latticeExtra_0.6-29 pbmcapply_1.5.0 tools_3.6.2
## [115] future.apply_1.4.0 circlize_0.4.8 rstudioapi_0.11
## [118] foreach_1.4.8 foreign_0.8-76 Rtsne_0.15
## [121] digest_0.6.25 FNN_1.1.3 qlcMatrix_0.9.7
## [124] Rcpp_1.0.3 later_1.0.0 RcppAnnoy_0.0.15
## [127] httr_1.4.1 AnnotationDbi_1.46.1 sf_0.8-1
## [130] npsurv_0.4-0 Rdpack_0.11-1 colorspace_1.4-1
## [133] XML_3.99-0.3 reticulate_1.14 splines_3.6.2
## [136] uwot_0.1.5 expm_0.999-4 sn_1.5-5
## [139] sp_1.4-1 multtest_2.40.0 spData_0.3.3
## [142] xtable_1.8-4 jsonlite_1.6.1 R6_2.4.1
## [145] gmodels_2.18.1 TFisher_0.2.0 Hmisc_4.3-1
## [148] pillar_1.4.3 htmltools_0.4.0 mime_0.9
## [151] glue_1.3.1 fastmap_1.0.1 class_7.3-15
## [154] codetools_0.2-16 tsne_0.1-3 mvtnorm_1.1-0
## [157] lattice_0.20-40 tibble_2.1.3 numDeriv_2016.8-1.1
## [160] leiden_0.3.3 gtools_3.8.1 survival_3.1-8
## [163] limma_3.40.6 rmarkdown_2.1 docopt_0.6.1
## [166] fastICA_1.2-2 munsell_0.5.0 e1071_1.7-3
## [169] GetoptLong_0.1.8 GenomeInfoDbData_1.2.1 iterators_1.0.12
## [172] gtable_0.3.0
## [1] "Seurat 3.1.4"
createDT <- function(DF, caption="", scrollY=500){
data <- DT::datatable(DF, caption=caption,
extensions = 'Buttons',
options = list( dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = F,
columnDefs = list(list(className = 'dt-center', targets = "_all"))
)
)
return(data)
}
# Need to wrap inside tagList when rendering DTs within a for loop
createDT_html <- function(DF, caption="", scrollY=400){
print( htmltools::tagList( createDT(DF, caption, scrollY)) )
}Rstudio has a default memory limit of only 1GB. To override this, detect the true memory available and set a new limit.
Since the original Seurat object was made with an older version of Seurat, it has to be converted to the updated Seurat V3 object.
## ! IMPORTANT! Must not setwd to local path when launching on cluster
# setwd("~/Desktop/PD_scRNAseq/")
dir.create("./Data", showWarnings=F)
load(file.path(root,"Data/seurat_object_add_HTO_ids.Rdata"))
DAT <- seurat.obj
DAT <- Seurat::UpdateSeuratObject(DAT)## Updating from v2.X to v3.X
## Validating object structure
## Updating object slots
## Ensuring keys are in the proper strucutre
## Ensuring feature names don't have underscores or pipes
## Object representation is consistent with the most current Seurat version
## An object of class Seurat
## 24914 features across 22113 samples within 1 assay
## Active assay: RNA (24914 features)
metadata <- read.table(file.path(root,"Data/meta.data4.tsv"))
createDT( head(metadata), caption = "Metadata") ## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
# Make AgeGroups
makeAgeGroups <- function(){
dim(metadata)
getMaxRound <- function(vals=metadata$Age, unit=10)unit*ceiling((max(vals)/unit))
getMinRound <- function(vals=metadata$Age, unit=10)unit*floor((min(vals)/unit))
ageBreaks = c(seq(getMinRound(), getMaxRound(), by = 10), getMaxRound()+10)
AgeGroupsUniq <- c()
for (i in 1:(length(ageBreaks)-1)){
AgeGroupsUniq <- append(AgeGroupsUniq, paste(ageBreaks[i],ageBreaks[i+1], sep="-"))
}
data.table::setDT(metadata,keep.rownames = T,check.names = F)[, AgeGroups := cut(Age,
breaks = ageBreaks,
right = F,
labels = AgeGroupsUniq,
nclude.lowest=T)]
metadata <- data.frame(metadata)
unique(metadata$AgeGroups)
head(metadata)
dim(metadata)
return(metadata)
}
# metadata <- makeAgeGroups
DAT <- AddMetaData(object = DAT, metadata = metadata)
# Get rid of any NAs (cells that don't match up with the metadata)
if(subsetCells==F){
DAT <- subset(DAT , nGene > 0 )
} else {
subset(DAT, nGene >0)[,colnames(DAT)[0:subsetCells]]
} An object of class Seurat 24914 features across 495 samples within 1 assay Active assay: RNA (24914 features)
Include only subsets of genes by type. Biotypes from: https://useast.ensembl.org/info/genome/genebuild/biotypes.html
subsetBiotypes <- function(DAT, subsetGenes){
if( subsetGenes!=F ){
cat(paste("Subsetting genes:",subsetGenes, "\n"))
# If the gene_biotypes file exists, import csv. Otherwise, get from biomaRt
if(file_test("-f", file.path(root,"Data/gene_biotypes.csv"))){
biotypes <- read.csv(file.path(root,"Data/gene_biotypes.csv"))
}
else {
ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org",
dataset="hsapiens_gene_ensembl")
ensembl <- useDataset(mart = ensembl, dataset = "hsapiens_gene_ensembl")
listFilters(ensembl)
listAttributes(ensembl)
biotypes <- getBM(attributes=c("hgnc_symbol", "gene_biotype"), filters="hgnc_symbol",
values=row.names(DAT), mart=ensembl)
write.csv(biotypes, file.path(root,"Data/gene_biotypes.csv"), quote=F, row.names=F)
}
# Subset data by creating new Seurat object (annoying but necessary)
geneSubset <- biotypes[biotypes$gene_biotype==subsetGenes,"hgnc_symbol"]
cat(paste(dim(DAT[geneSubset, ])[1],"/", dim(DAT)[1],"genes are", subsetGenes))
# Add back into DAT
DAT <- DAT[geneSubset,]
}
return(DAT)
}
DAT <- subsetBiotypes(DAT, subsetGenes = subsetGenes)## Subsetting genes: protein_coding
## 14827 / 24914 genes are protein_coding
Filter by cells, normalize , filter by gene variability.
## Total Cells: 22113
DAT <- subset(DAT, nGene < 2500 & nGene > 200 & percent.mito < .05)
cat("Filtered Cells:", ncol(DAT))## Filtered Cells: 19144
Important!: * In ScaleData… + Specify do.par = F (unless you have parallel processing set up properly, this will cause your script to crash) + Specify num.cores = nCores (to use all available cores, determined by parallel::detectCores())
Regress out: number of unique transcripts (nUMI), % mitochondrial transcripts (percent.mito)
# Store the top most variable genes in @var.genes
DAT <- FindVariableFeatures(object = DAT,
mean.function = ExpMean,
dispersion.function = LogVMR,
x.low.cutoff = 0.0125,
x.high.cutoff = 3,
y.cutoff = 0.5)## Warning: The following arguments are not used: x.low.cutoff, x.high.cutoff,
## y.cutoff
## Total Genes: 14827
## Highly Variable Genes: 2000
# IMPORTANT!: Must set do.par=T and num.cors = n for large datasets being processed on computing clusters
# IMPORTANT!: Use only the var.genes identified by 'FindVariableGenes' as the 'gene.use' arg in 'ScaleData'
## This will greatly reduced the computational load.
# Ensure CD14 and CD16 are included
appendedGenes <- c(var.genes, "CD14", "FCGR3A")
DAT <- ScaleData(object = DAT,
vars.to.regress = c("nUMI", "percent.mito"))## Regressing out nUMI, percent.mito
## Centering and scaling data matrix
## An object of class Seurat
## 14827 features across 19144 samples within 1 assay
## Active assay: RNA (14827 features)
You can mix genes and metadata variables in these plots just by listing them all in the features argument.
ProjectPCA scores each gene in the dataset (including genes not included in the PCA) based on their correlation with the calculated components. Though we don’t use this further here, it can be used to identify markers that are strongly correlated with cellular heterogeneity, but may not have passed through variable gene selection. The results of the projected PCA can be explored by setting use.full=T in the functions above
# Run PCA with only the top most variables genes
DAT <- RunPCA(object = DAT,
pc.genes = appendedGenes,
do.print=F, verbose=F) #, pcs.print = 1:5, genes.print = 5
PCAPlot(object = DAT, group.by=c("dx","mut","ID"), dims=c(1,2), ncol=2)Determine statistically significant PCs for further analysis. NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in PCElbowPlot() can be used to reduce computation time
do.fast speeds up t-SNE but makes gives you slightly differnt results each time (unless you set the seed).
DAT <- RunTSNE(object=DAT, reduction.use = "pca",
dims.use = 1:10,
do.fast = T,
perplexity = perplexity,
tsne.method = "Rtsne",
num_threads=nCores,
seed.use = 2019,
verbose=F) # FItSNE
DimPlot(object = DAT, reduction = 'tsne', group.by = c("dx","mut","ID"), ncol=2)Additional UMAP arguments detailed here: https://umap-learn.readthedocs.io/en/latest/api.html#module-umap.umap_
NOTE: Specifying n.components = <large_number> take a little more time but helps clustering later on.
# cat(print(mem_used()))
DAT <- RunUMAP(object = DAT,
reduction = "pca",
dims = 1:10,
verbose=T,
num_threads=nCores,
n.components=10)## Warning: The following arguments are not used: num_threads
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 20:06:16 UMAP embedding parameters a = 0.9922 b = 1.112
## 20:06:16 Read 19144 rows and found 10 numeric columns
## 20:06:16 Using Annoy for neighbor search, n_neighbors = 30
## 20:06:16 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 20:06:19 Writing NN index file to temp file /var/folders/sw/km9819713937hp640lj59ygw0000gn/T//RtmpP9OGec/file7aec5decc50e
## 20:06:19 Searching Annoy index using 1 thread, search_k = 3000
## 20:06:27 Annoy recall = 100%
## 20:06:27 Commencing smooth kNN distance calibration using 1 thread
## 20:06:29 Initializing from normalized Laplacian + noise
## 20:06:29 Commencing optimization for 200 epochs, with 741522 positive edges
## 20:06:47 Optimization finished
# Plot results
DimPlot(object = DAT, reduction = 'umap', group.by = c("dx","mut","ID"), pt.size = .01, ncol=2)Seurat v3 applies a graph-based clustering approach, building upon initial strategies in (Macosko et al). Importantly, the distance metric which drives the clustering analysis (based on previously identified PCs) remains the same. However, our approach to partioning the cellular distance matrix into clusters has dramatically improved. Our approach was heavily inspired by recent manuscripts which applied graph-based clustering approaches to scRNA-seq data [SNN-Cliq, Xu and Su, Bioinformatics, 2015] and CyTOF data [PhenoGraph, Levine et al., Cell, 2015]. Briefly, these methods embed cells in a graph structure - for example a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar feature expression patterns, and then attempt to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’.
As in PhenoGraph, we first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). This step is performed using the FindNeighbors function, and takes as input the previously defined dimensionality of the dataset (first 10 PCs).
To cluster the cells, we next apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function. The FindClusters function implements this procedure, and contains a resolution parameter that sets the ‘granularity’ of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells. Optimal resolution often increases for larger datasets. The clusters can be found using the Idents function.
IMPORTANT: Only use the most variable genes for the graph construction step of clustering for much better clustering
# Stash old identity
DAT[["pre_clustering"]] <- Idents(object = DAT)
# DAT <- SetAllIdent(object = DAT, id = "pre_clustering")
# Cluster
DAT <- FindNeighbors(DAT,
dims = 1:10,
reduction = "umap", # can cluster using UMAP or PCA dims (or even directly on the genes)
features = appendedGenes,
force.recalc = T)## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 19144
## Number of edges: 510072
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9593
## Number of communities: 3
## Elapsed time: 1 seconds
# Stash new identity
DAT[["post_clustering"]] <- Idents(object = DAT)
# Plot clusters
DimPlot(object = DAT, label = T)#, do.hover=do.hover, data.hover="mut")## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
Seurat has several tests for differential expression which can be set with the test.use parameter (see the DE vignette for details). For example, the ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect).
Shown here: Biomarkers of each cluster vs. all other clusters.
# Limit to only top variable genes?:
# Set arg 'only.pos=F' to capture negative biomarkers
# DAT <- SetIdent(DAT, ident.use = "post_clustering")
DAT.markers <- FindAllMarkers(object = DAT, min.pct = 0.25,
thresh.use = 0.25, only.pos = F,
test.use = "wilcox")## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
DAT.markers <- DAT.markers %>% mutate(FC = 2^avg_logFC)
DAT.markers.sig <- DAT.markers %>% subset(p_val_adj<=0.05)
markers.summary <- DAT.markers.sig %>% group_by(cluster) %>% tally()
# markers.summary <- base::merge(DAT.markers.sig %>% group_by(cluster) %>% tally(),
# DAT.markers %>% group_by(cluster) %>% summarise(mean(avg_logFC)),
# by="cluster" )
createDT(markers.summary, caption = "Number of DEGs and Mean logFC per Cluster")VlnPlot(object = DAT, features = topBiomarkers$gene,
group.by = "post_clustering", log = T, pt.size = .1) +
ggplot2::aes(alpha=0.5) + xlab( "Cluster") + ylab( "Expression")Uses the EnhancedVolcano library. Tutorial. .
##Construct the plot object
volcanoPlot <- function(DEG_df,
caption="",
topFC_labeled=Inf,
FC_cutoff=1,
Q_cutoff=.05,
show_plot=T){
yMax <- max(-log10(DEG_df$p_val_adj)) + max(-log10(DEG_df$p_val_adj))/3
# IMPORTANT!# Must replace 0s with small numbers to avoid getting errors when taking the -log
DEG_df[DEG_df$p_val_adj==0,"p_val_adj"] <- .Machine$double.xmin
rownames(DEG_df) <- DEG_df$gene
labeled_genes <- subset(DEG_df, p_val_adj<Q_cutoff | avg_logFC>=FC_cutoff)
if(nrow(labeled_genes)>topFC_labeled){
labeled_genes <- arrange(labeled_genes, p_val_adj, desc(avg_logFC))[1:topFC_labeled,]
}
xlimit <- max(abs(DEG_df$avg_logFC))*1.1
# EnhancedVolcano library
vol <- EnhancedVolcano::EnhancedVolcano(toptable = DEG_df,
x="avg_logFC",
y="p_val_adj",
transcriptPointSize = 3,
pCutoff = Q_cutoff,
FCcutoff = FC_cutoff,
lab=rownames(DEG_df),
cutoffLineCol = 'grey30',
col = c("grey30", "forestgreen", "royalblue", "red2"),
legend = c("NA",
paste("log2(FC) ≥", FC_cutoff),
paste("q-value <",Q_cutoff),
paste("q-value <",Q_cutoff,"& log2(FC) ≥", FC_cutoff)
),
xlab = "log2(FC)",
ylab = "-log10(q-value)",
# legend = c("Bonferonni < .05",),
# col = c("black","purple","turquoise"),
title=caption,
subtitle = "") +
xlim(c(-xlimit,xlimit)) +
labs(fill="DGE Group") +
theme_bw()
if(show_plot){print(vol)}
return(vol)
# # Custom volcano plot
# DEG_df$sig<- ifelse( DEG_df$p_val_adj<0.05 & DEG_df$avg_logFC<1.5, "p_val_adj<0.05",
# ifelse( DEG_df$p_val_adj<0.05 & DEG_df$avg_logFC>1.5, "p_val_adj<0.05 & avg_logFC>1.5",
# "p_val_adj>0.05"
# ))
# DEG_df <- arrange(DEG_df, desc(sig))
# vol <- ggplot(data=DEG_df, aes(x=avg_logFC, y= -log10(p_val_adj))) +
# geom_point(alpha=0.5, size=3, aes(col=sig)) +
# scale_color_manual(values=list("p_val_adj<0.05"="turquoise3",
# "p_val_adj<0.05 & avg_logFC>1.5"="purple",
# "p_val_adj>0.05" = "darkgray")) +
# theme(legend.position = "none") +
# xlab(expression(paste("Average ",log^{2},"(fold change)"))) +
# ylab(expression(paste(-log^{10},"(p-value)"))) +
# xlim(-2,2) + ylim(0, yMax) +
# ## ggrepl labels
# ggrepel::geom_text_repel(data=labeled_genes,
# aes(label=gene, x=avg_logFC, y= -log10(p_val_adj)),
# color="black", alpha=.5,
# segment.color="black", segment.alpha=.5
# ) +
# # Lines
# geom_vline(xintercept= -1.5,lty=4, lwd=.3, alpha=.5) +
# geom_vline(xintercept= 1.5,lty=4, lwd=.3, alpha=.5) +
# geom_hline(yintercept= -log10(0.05),lty=4, lwd=.3, alpha=.5) +
# ggtitle(caption)
# print(vol)
}A “biomarker” is a set of genes that characterize a given cluster AND differntiate it from all other clusters.
# Run plots
for (clust in unique(DAT.markers$cluster)){
cat(' \n#### Cluster ',clust,': Volcano',' \n')
cap <- paste("Cluster",clust,"Biomarkers")
DEG_df <- subset(DAT.markers, cluster==as.character(clust)) %>% arrange(desc(avg_logFC))
vol <- volcanoPlot(DEG_df = DEG_df, caption = cap, show_plot = F)
print(vol)
createDT_html(DEG_df, caption = cap)
cat(' \n')
} ## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
top1 <- DAT.markers %>% group_by(cluster) %>% top_n(topNum, avg_logFC)
# top1 <- Seurat::
for (clust in unique(top1$cluster)){
subClust <- subset(top1, cluster==clust)
cat(' \n#### Cluster',clust,' \n')
print(paste("Biomarker:",subClust$gene))
# try({
query <- subset(subClust, p_val_adj<0.05)$gene %>% as.character()
gostres <- gprofiler2::gost(query = query, organism = "hsapiens", significant = F)
gprofiler2::gostplot(gostres)
sig.results <- gostres$result %>%
dplyr::mutate(q_value = p.adjust(p = gostres$result$p_value, method = "bonferroni")) %>%
subset(q_value<0.05)
if(nrow(sig.results)>0){
createDT_html(sig.results)
} else {print("No significant GO terms at Bonferonni-corrected p-value < 0.05.")}
# })
# results <- Seurat::FindGeneTerms(QueryGene = subClust$gene) # defunct in SeuratV3
# print(results) #parse_html_notebook(results)
cat(' \n')
}[1] “Biomarker: FCGR3A” “Biomarker: RHOC” “Biomarker: CDKN1C” [4] “Biomarker: IFITM3” “Biomarker: MS4A7” [1] “No significant GO terms at Bonferonni-corrected p-value < 0.05.”
[1] “Biomarker: FCER1A” “Biomarker: CLEC10A” “Biomarker: HLA-DQA1” [4] “Biomarker: HLA-DQA2” “Biomarker: HLA-DQB1” [1] “No significant GO terms at Bonferonni-corrected p-value < 0.05.”
top5 <- DAT.markers %>% group_by(cluster) %>% top_n(5, avg_logFC)
Seurat::DoHeatmap(object = DAT, features = top5$gene, angle = 0)## Picking joint bandwidth of 0.222
## Picking joint bandwidth of 0.14
## Picking joint bandwidth of 0.134
## Picking joint bandwidth of 0.151
## Picking joint bandwidth of 0.135
## Picking joint bandwidth of 0.0824
## Picking joint bandwidth of 0.0743
## Picking joint bandwidth of 0.0677
## Picking joint bandwidth of 0.202
## Picking joint bandwidth of 0.0789
## Picking joint bandwidth of 0.0981
## Picking joint bandwidth of 0.0902
## Picking joint bandwidth of 0.159
## Picking joint bandwidth of 0.084
## Picking joint bandwidth of 0.165
Visualize biomarker expression for each cluster, by disease
top2 <- DAT.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
sdp <- Seurat::DotPlot(object = DAT, features = top5$gene, cols = c("blue","red"), group.by = c("mut")) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
print(sdp)The following plots show the absolute expression of each biomarker, as opposed to avg_logFC which is dependent on the expression patterns of other cell types being compared.
markerList <- c("CD14", "FCGR3A")
get_markerDF <- function(DAT,
markerList,
meta_vars =c("barcode", "dx", "mut","post_clustering",
"percent.mito","nGene", "nUMI")){
exp <- DAT@assays$RNA@scale.data %>% data.frame()
marker.matrix <- exp[row.names(exp) %in% markerList, ]
marker.matrix$Gene <- row.names(marker.matrix)
markerMelt <- reshape2:::melt.data.frame(marker.matrix, id.vars = "Gene", variable.name = "Cell",value.name = "Expression")
metaSelect <- DAT@meta.data[,meta_vars]
markerDF <- merge(markerMelt,metaSelect, by.x="Cell", by.y="barcode")
return(markerDF)
}
markerDF <- get_markerDF(DAT, markerList)
createDT(head(markerDF), caption = "Known Marker Expression")## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
# # Explore expression differences between groups
# marker_vs_metadata <- function(markerDF, meta_var){
# # Create title from ANOVA summary
# ANOVAtitle <- function(markerDF, marker){
# nTests <- length(unique(markerDF$Gene))
# res <- anova(lm(data = subset(markerDF, Gene==marker),
# formula = Expression ~ eval(parse(text=meta_var))))
#
# title <-paste(paste("ANOVA (",marker, " x ",meta_var, ")", sep=""),
# ": p=",round(res$`Pr(>F)`,3),
# ", F=",round(res$`F value`,3),
# ifelse(res$`Pr(>F)`<.05/nTests,"(Significant**)",
# "(Non-significant)") )
# }
#
# title = ""
# for (marker in unique(markerDF$Gene) ){
# cat(marker)
# title <- paste(title, "\n", ANOVAtitle(markerDF, marker))
# }
#
#
# ggplot(markerDF, aes(x=Gene, y=Expression, fill=eval(parse(text=meta_var)))) +
# geom_violin() +
# # Beeswarm not great for lots of data
# # ggbeeswarm::geom_beeswarm(alpha=.5, color="turquoise") +
# geom_point( position=position_jitterdodge(jitter.width = .2, dodge.width = .9 ), alpha=0.6, color="turquoise3") +
# labs(title = title, x=meta_var, fill=meta_var) +
# theme(plot.title = element_text( size=10)) +
# scale_fill_manual(values=c("brown", "slategray")) +
# theme_classic()
# }Garnett works with the Monocle library so you need to convert the Seurat object to CDS format (which Monocle uses) first. NOTE: You must install the version of Garnett that is compatible with your version of Monocle. E.g. devtools::install_github("cole-trapnell-lab/garnett", ref="monocle3")
# Convert Seurat object to CDS object
load(file.path(root,"Data/seurat_object_add_HTO_ids.Rdata"), )
mDAT <- monocle::importCDS(seurat.obj, import_all = F)
remove(seurat.obj)
# generate size factors for normalization later
mDAT <- estimateSizeFactors(mDAT)
# Get pre-trained PBMC classifer
# Download from: https://cole-trapnell-lab.github.io/garnett/classifiers
classifier_file <- "./Data/hsPBMC_20191017.RDS"
if(!file.exists(classifier_file)){
download.file(url="https://cole-trapnell-lab.github.io/garnett/classifiers/hsPBMC_20191017.RDS",
destfile = "./Data/hsPBMC_20191017.RDS")
}
hsPBMC <- readRDS("./Data/hsPBMC_20191017.RDS")
# Get feature genes for each cell type
feature_genes <- get_feature_genes(hsPBMC,
node = "root",
db = org.Hs.eg.db,
convert_ids = T)
head(feature_genes)
library(org.Hs.eg.db)
mDAT <- classify_cells(cds = mDAT,
classifier = hsPBMC,
db = org.Hs.eg.db,
cluster_extend = TRUE,
cds_gene_id_type = "SYMBOL")
head(pData(mDAT))
table(pData(mDAT)$cell_type)
table(pData(mDAT)$cluster_ext_type)
# Run tSNE: Plot Clusters and Cell Types
mDAT <- reduceDimension(mDAT, max_components = 3, reduction_method = "tSNE")
commonGeoms <- labs(x="tSNE1",y="tSNE2")
plot_grid(nrow = 2,
qplot(data = pData(mDAT), mDAT@reducedDimA[1,], mDAT@reducedDimA[2,], color = cell_type) + theme_bw() + commonGeoms,
qplot(data = pData(mDAT), mDAT@reducedDimA[1,], mDAT@reducedDimA[2,], color = cluster_ext_type) + theme_bw() + commonGeoms
)
# Unsupervised Clustering
mDAT <- clusterCells(mDAT, num_clusters = 5)
pData(mDAT)
plot_cell_clusters(mDAT, 1, 2, color = "Cluster", markers = c("CD14", "FCGR3A"))
plot_cell_clusters(mDAT, 1, 2, color = "Cluster") + facet_wrap(~dx)
plot_cell_clusters(mDAT, 1, 2, color = "Cluster") + facet_wrap(~mut)
plot_cell_clusters(mDAT, 1, 2, color = "Cluster") + facet_wrap(~cell_type)
DAT <- Seurat::AddMetaData(DAT, pData(mDAT)[c("garnett_cluster","cell_type","cluster_ext_type","Cluster")])WARNING: Very computationally expensive!
mDAT <- reduceDimension(mDAT, max_components = 2, method = 'DDRTree')
mDAT <- orderCells(mDAT)
plot_cell_trajectory(mDAT, color_by = "dx")
plot_cell_trajectory(mDAT, color_by = "cell_type")
plot_cell_trajectory(HSMM_myo, color_by = "cell_type") +
facet_wrap(~mut, nrow = 2)markerDF <- get_markerDF(DAT, markerList,
meta_vars =c("barcode", "dx", "mut","ID","post_clustering", "percent.mito","nGene", "nUMI"))
Spectral <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(length(unique(DAT@meta.data$mut)), "Spectral"))
# DAT <- DoKMeans(DAT, k.genes = 3)
# KMeansHeatmap(DAT)
if (T==F){
# Spectral <- heatmaply::Spectral(length(unique(DAT@meta.data$mut)))
markerMelt <- reshape2::acast(markerDF, Cell~Gene, value.var="Expression", fun.aggregate = mean, drop = F, fill = 0)
heatmaply::heatmaply(markerMelt, key.title="Expression",#plot_method= "ggplot",
k_row = dim(markerMelt)[2], dendrogram = "row",
showticklabels = c(T, F), xlab = "Known Markers", ylab = "Cells", column_text_angle = 45,
row_side_colors = DAT@meta.data[,c("dx","mut", "cell_type")], row_side_palette = Spectral
) %>% colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 2) %>%
colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 1)
}else{
# markerDF_sub <-subset(markerDF, Gene==markerList[1])
# var_to_colors(markerDF_sub, "post_clustering")
# library(pheatmap)
# pheatmap(markerMelt, annotation_row = markerDF_sub[c("dx","mut","cell_type")])
# pheatmap(markerMelt, kmeans_k = NA, annotation_row = markerDF_sub[c("dx","mut","cell_type")],
# cluster_cols = F, cutree_rows = length(unique(markerDF$post_clustering)), angle_col=45 )
library(RColorBrewer)
var_to_colors <- function(markerDF, metaVar){
colors <- brewer.pal(length(unique(markerDF[metaVar]) ), "Dark2")
sample(colors, length(unique(markerDF[metaVar])), replace = TRUE, prob = NULL)
# metaColors <- colors[ subset(markerDF, Gene==markerList[1])[metaVar][,1] %>% as.factor() ]
return(metaColors)
}
# library(GMD)
# myCols = cbind(var_to_colors(markerDF, "dx"), var_to_colors(markerDF, "mut"))
# rlab=t(cbind(
# var_to_colors(markerDF, "post_clustering"),
# var_to_colors(markerDF, "dx")
# ))
# heatmap.2(marker.matrix, key.title="Expression", col = viridis(300), trace="none",Colv = F, Rowv = F,
# labRow = F, xlab = "Biomarker", ylab="Cell", cexCol=1, RowSideColors = var_to_colors(markerDF, "post_clustering")
# )
# heatmap.3(markerMelt, dendrogram = 'row', kr = length(unique(markerDF)), labRow = F,
# xlab = "Biomarker", ylab = "Cell", RowSideColors = rlab, RowSideColorsSize=2 )
markerDF <- markerDF %>%
mutate_at(vars(post_clustering, dx, mut, ID), as.factor) %>%
mutate(Cluster = post_clustering) %>%
arrange(post_clustering)
# markerMelt <- reshape2::acast(markerDF, Cell~Gene, value.var="Expression", fun.aggregate = mean, drop = F, fill = 0)
markerMelt <- dcast(markerDF, Cell + post_clustering + dx + mut + ID ~ Gene,
fun.aggregate = mean, value.var = "Expression") %>% arrange(post_clustering)
marker.matrix <- markerMelt[markerList] %>%as.matrix()
row.names(marker.matrix) <- markerMelt$Cell
ha = HeatmapAnnotation(df = markerDF[c("dx","mut","ID","post_clustering")], which = "row")
ComplexHeatmap::Heatmap(marker.matrix, col=viridis(300), column_title = "Biomarker", row_title = "Cell",
row_dend_reorder = F,show_row_names = F, show_column_dend = F,show_row_dend =T,
cluster_rows = T, column_title_side = "bottom",km = length(unique(markerMelt$post_clustering))) + ha
print(ha)
} ## A HeatmapAnnotation object with 4 annotations
## name: heatmap_annotation_0
## position: row
## items: 38288
## width: 21.0543794105438mm
## height: 1npc
## this object is subsetable
## 29.5326666666667mm extension on the bottom
##
## name annotation_type color_mapping width
## dx discrete vector random 5mm
## mut discrete vector random 5mm
## ID discrete vector random 5mm
## post_clustering discrete vector random 5mm
markerDF <- markerDF %>% mutate(Cluster = post_clustering)
# Show mean exp for each marker
avgMarker <- markerDF %>% group_by(Gene, Cluster) %>% summarise(meanExp = mean(Expression))
p <- ggplot(data = avgMarker, aes(x=Gene, y=Cluster, fill=meanExp)) %>% + geom_tile() + scale_fill_viridis()
print(p)DGE methods available in Seurat include:
“wilcox” : Identifies differentially expressed genes between two groups of cells using a Wilcoxon Rank Sum test (default)
“bimod” : Likelihood-ratio test for single cell gene expression, (McDavid et al., Bioinformatics, 2013)
“roc” : Identifies ‘markers’ of gene expression using ROC analysis. For each gene, evaluates (using AUC) a classifier built on that gene alone, to classify between two groups of cells. An AUC value of 1 means that expression values for this gene alone can perfectly classify the two groupings (i.e. Each of the cells in cells.1 exhibit a higher level than each of the cells in cells.2). An AUC value of 0 also means there is perfect classification, but in the other direction. A value of 0.5 implies that the gene has no predictive power to classify the two groups. Returns a ‘predictive power’ (abs(AUC-0.5) * 2) ranked matrix of putative differentially expressed genes.
“t” : Identify differentially expressed genes between two groups of cells using the Student’s t-test.
“negbinom” : Identifies differentially expressed genes between two groups of cells using a negative binomial generalized linear model. Use only for UMI-based datasets
“poisson” : Identifies differentially expressed genes between two groups of cells using a poisson generalized linear model. Use only for UMI-based datasets
“LR” : Uses a logistic regression framework to determine differentially expressed genes. Constructs a logistic regression model predicting group membership based on each feature individually and compares this to a null model with a likelihood ratio test.
“MAST” : Identifies differentially expressed genes between two groups of cells using a hurdle model tailored to scRNA-seq data. Utilizes the MAST package to run the DE testing.
“DESeq2” : Identifies differentially expressed genes between two groups of cells based on a model using DESeq2 which uses a negative binomial distribution (Love et al, Genome Biology, 2014).This test does not support pre-filtering of genes based on average difference (or percent detection rate) between cell groups. However, genes may be pre-filtered based on their minimum detection rate (min.pct) across both cell groups. To use this method, please install DESeq2, using the instructions at https://bioconductor.org/packages/release/bioc/html/DESeq2.html
# Available DGE methods:
## "wilcox", "bimod", "roc", "t", "tobit", "poisson", "negbinom", "MAST", "DESeq2"
runDGE <- function(cds,
meta_var,
group1, group2,
test.use="MAST",
show_plot=T,
show_table=T,
save_path="./Results/Seurat",
compress=F,
cluster="allClusters"){
#print(paste("DGE_allCells",meta_var,sep="_"))
# DAT <- SetAllIdent(DAT, id = meta_var)
# DAT <- StashIdent(DAT, save.name = meta_var)
# DAT[[meta_var]] <- Idents(object = DAT)
cds <- SetIdent(cds, value=meta_var)
DEGs <- FindMarkers(object = cds,
ident.1=group1,
ident.2=group2,
test.use=test.use,
min.pct = 0.1,
logfc.threshold = 0.25,
min.cells.feature = 3,
min.cells.group = 3,
only.pos = F,
verbose = F)
DEGs$gene <- row.names(DEGs)
cds <- SetIdent(cds, value="post_clustering")
cap <- paste("Cluster",cluster,"DEGs:",group1, "vs.", group2)
if(show_table){
createDT(DEGs, caption = cap)
}
if(show_plot){
vol <- volcanoPlot(DEGs, caption = cap, show_plot = F)
print(vol)
}
if(save_path!=F){
res_path <- file.path(save_path,
paste0("DGE.results_",meta_var,"_cluster",cluster,".tsv",
ifelse(compress,".gz","")))
message("Saving DGE results ==> ",res_path)
dir.create(dirname(res_path), showWarnings = F, recursive = T)
data.table::fwrite(DEGs,
file = res_path,
sep="\t",
nThread = 4)
}
return(DEGs)
}## Assuming data assay in position 1, with name et is log-transformed.
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Saving DGE results ==> ./Results/Seurat/DGE.results_dx_clusterallClusters.tsv
## Assuming data assay in position 1, with name et is log-transformed.
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Saving DGE results ==> ./Results/Seurat/DGE.results_mut_clusterallClusters.tsv
# NOTE: If you try to wrap the functions below into a function, it will just duplicate the results of the last cluster for all the others. This is totally bizarre...
meta_var = "dx"; group1 = "PD"; group2 = "control";
for (clust in unique(DAT@meta.data$post_clustering)){
DAT_clustSub<-NULL; DEGs<-NULL
# Subset cells by cluster
cat(' \n###',paste('Cluster ',clust,'- ',group1,' vs. ', group2, sep='') , ' \n')
DAT_clustSub <- subset(DAT, post_clustering==clust)
print(DAT_clustSub)
DEGs <-runDGE(cds = DAT_clustSub,
meta_var = meta_var,
group1 = group1, group2 = group2,
test.use = "MAST",
cluster = clust,
show_plot = F,
show_table = F)
vol <- volcanoPlot(DEG_df = DEGs,
caption = paste("Cluster",clust,"DEGs:",group1,"vs.",group2),
show_plot = F)
print(vol)
createDT_html(DEGs)
cat(' \n')
} An object of class Seurat 14827 features across 15971 samples within 1 assay Active assay: RNA (14827 features) 3 dimensional reductions calculated: pca, tsne, umap
## Assuming data assay in position 1, with name et is log-transformed.
##
## Done!
## Combining coefficients and standard errors
## Warning in melt(coefAndCI, as.is = TRUE): The melt generic in data.table has
## been passed a array and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(coefAndCI). In the next version, this warning will become an
## error.
## Calculating log-fold changes
## Warning in melt(lfc): The melt generic in data.table has been passed a list
## and will attempt to redirect to the relevant reshape2 method; please note that
## reshape2 is deprecated, and this redirection is now deprecated as well. To
## continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(lfc). In the
## next version, this warning will become an error.
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Warning in melt(llrt): The melt generic in data.table has been passed a list
## and will attempt to redirect to the relevant reshape2 method; please note that
## reshape2 is deprecated, and this redirection is now deprecated as well. To
## continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(llrt). In the
## next version, this warning will become an error.
## Saving DGE results ==> ./Results/Seurat/DGE.results_dx_cluster0.tsv
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
An object of class Seurat 14827 features across 2432 samples within 1 assay Active assay: RNA (14827 features) 3 dimensional reductions calculated: pca, tsne, umap
## Assuming data assay in position 1, with name et is log-transformed.
##
## Done!
## Combining coefficients and standard errors
## Warning in melt(coefAndCI, as.is = TRUE): The melt generic in data.table has
## been passed a array and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(coefAndCI). In the next version, this warning will become an
## error.
## Calculating log-fold changes
## Warning in melt(lfc): The melt generic in data.table has been passed a list
## and will attempt to redirect to the relevant reshape2 method; please note that
## reshape2 is deprecated, and this redirection is now deprecated as well. To
## continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(lfc). In the
## next version, this warning will become an error.
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Warning in melt(llrt): The melt generic in data.table has been passed a list
## and will attempt to redirect to the relevant reshape2 method; please note that
## reshape2 is deprecated, and this redirection is now deprecated as well. To
## continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(llrt). In the
## next version, this warning will become an error.
## Saving DGE results ==> ./Results/Seurat/DGE.results_dx_cluster1.tsv
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
An object of class Seurat 14827 features across 741 samples within 1 assay Active assay: RNA (14827 features) 3 dimensional reductions calculated: pca, tsne, umap
## Assuming data assay in position 1, with name et is log-transformed.
##
## Done!
## Combining coefficients and standard errors
## Warning in melt(coefAndCI, as.is = TRUE): The melt generic in data.table has
## been passed a array and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(coefAndCI). In the next version, this warning will become an
## error.
## Calculating log-fold changes
## Warning in melt(lfc): The melt generic in data.table has been passed a list
## and will attempt to redirect to the relevant reshape2 method; please note that
## reshape2 is deprecated, and this redirection is now deprecated as well. To
## continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(lfc). In the
## next version, this warning will become an error.
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Warning in melt(llrt): The melt generic in data.table has been passed a list
## and will attempt to redirect to the relevant reshape2 method; please note that
## reshape2 is deprecated, and this redirection is now deprecated as well. To
## continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(llrt). In the
## next version, this warning will become an error.
## Saving DGE results ==> ./Results/Seurat/DGE.results_dx_cluster2.tsv
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
meta_var = "mut"; group1 = "LRRK2"; group2 = "PD";
for (clust in unique(DAT@meta.data$post_clustering)){
DAT_clustSub<-NULL; DEGs<-NULL
# Subset cells by cluster
cat(' \n###',paste('Cluster ',clust,'- ',group1,' vs. ', group2, sep='') , ' \n')
DAT_clustSub <- subset(DAT, post_clustering==clust)
print(DAT_clustSub)
DEGs <-runDGE(cds = DAT_clustSub,
meta_var = meta_var,
group1 = group1, group2 = group2,
test.use = "MAST",
cluster = clust,
show_plot = F,
show_table = F)
vol <- volcanoPlot(DEG_df = DEGs,
caption = paste("Cluster",clust,"DEGs:",group1,"vs.",group2),
show_plot = F)
print(vol)
createDT_html(DEGs)
cat(' \n')
} An object of class Seurat 14827 features across 15971 samples within 1 assay Active assay: RNA (14827 features) 3 dimensional reductions calculated: pca, tsne, umap
## Assuming data assay in position 1, with name et is log-transformed.
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Saving DGE results ==> ./Results/Seurat/DGE.results_mut_cluster0.tsv
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
An object of class Seurat 14827 features across 2432 samples within 1 assay Active assay: RNA (14827 features) 3 dimensional reductions calculated: pca, tsne, umap
## Assuming data assay in position 1, with name et is log-transformed.
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Saving DGE results ==> ./Results/Seurat/DGE.results_mut_cluster1.tsv
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
An object of class Seurat 14827 features across 741 samples within 1 assay Active assay: RNA (14827 features) 3 dimensional reductions calculated: pca, tsne, umap
## Assuming data assay in position 1, with name et is log-transformed.
##
## Done!
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
## Saving DGE results ==> ./Results/Seurat/DGE.results_mut_cluster2.tsv
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
enrichr_dbs <- c("KEGG_2018", "Reactome_2016",
"GO_Biological_Process_2018", "GO_Molecular_Function_2018", "GO_Cellular_Component_2018",
"Rare_Diseases_AutoRIF_ARCHS4_Predictions", "Human_Gene_Atlas")
createDT(enrichR::listEnrichrDbs(), "Enrichr Databases")## Welcome to enrichR
## Checking connection ... Connection is Live!
for (clust in unique(DAT.markers.sig$cluster)){
cat(' \n###',' Cluster ',clust,'{.tabset .tabset-fade} \n')
geneList <- subset(DAT.markers.sig, cluster==clust)$gene %>% as.character()
results <- enrichR::enrichr(genes = geneList, databases = enrichr_dbs )
for (db in enrichr_dbs){
cat(' \n#### ',db,' \n')
createDT_html(subset(results[[db]], Adjusted.P.value<=0.05), paste("Enrichr Results: ",db,"Cluster ", clust))
cat(' \n')
}
cat(' \n')
} Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.
Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.
Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.
eigengenes <- read.delim("Data/bulkMonocytes_WGCNAmodules_geneMembership.txt", row.names = NULL)
modules <- read.delim("Data/bulkMonocytes_WGCNAmodules_geneModules.txt", row.names = NULL, sep = "",
col.names = c("Ensembl","moduleColors"))
modules <- base::merge(eigengenes, modules,by="Ensembl" )
for (mod in unique(modules$moduleColors)){
cat(' \n###','Module ",mod,"{.tabset .tabset-fade}',' \n')
geneList <- subset(modules, moduleColors==mod)$symbol %>% as.character()
results <- enrichr(genes = geneList, databases = enrichr_dbs )
for (db in enrichr_dbs){
cat(' \n')
cat(' \n####',db,' \n')
createDT_html(subset(results[[db]], Adjusted.P.value<=0.05), paste("Enrichr Results:",db,"Module", mod))
cat(' \n')
}
cat(' \n')
}Determine whether each of the clusters in scRNA-seq data are enriched for WGCNA eigengenes (a weighted vector of all genes representing each co-expression module).
#Get the average expression of every gene in each cluster
allGenes <- get_markerDF(DAT, markerList = row.names(DAT@scale.data), meta_vars = c("post_clustering", "barcode") )
clusterGeneAvg <- allGenes %>% group_by(post_clustering, Gene) %>% summarise(meanExp = mean(Expression))
eigengenes_filt <- subset(eigengenes,symbol %in% unique(clusterGeneAvg$Gene))
clusts_by_mods <- base::merge(clusterGeneAvg[c("Gene","meanExp")], eigengenes_filt[c("symbol", modName)],
by.x="Gene", by.y="symbol")
cor.test()
corrplot()
heatmap.2
f <- function(module){
eigengene <- eigengenes[paste0("MM", mod)]
means <- tapply(eigengenes, DAT@meta.data$post_clustering, mean, na.rm = T)
return(means)
}
modules <- c("blue", "brown", "green", "turquoise", "yellow")
plotdat <- sapply(modules, f)
matplot(plotdat, col = modules, type = "l", lwd = 2, xaxt = "n", xlab = "Seurat Cluster",
ylab = "WGCNA Module Eigengene")
axis(1, at = 1:16, labels = 0:15)
matpoints(plotdat, col = modules, pch = 21)library(RRHO) #BiocManager::install("RRHO")
# list.length <- 100
# list.names <- paste('Gene',1:list.length, sep='')
# gene.list1<- data.frame(list.names, sample(100))
# gene.list2<- data.frame(list.names, sample(100))
for (clust in unique(DAT.markers.sig)){
# Compare each cluster
subClust <- subset(DAT.markers.sig, cluster==clust) %>% arrange(desc(avg_logFC))
for (mod in unique(modules$moduleColors)){
# Sort genes by module membership
modName <-paste("MM",mod,sep="")
subMod <- subset(modules, moduleColors==mod) %>% arrange(desc(eval(parse(text = modName))))
maxGenes <- min(length(subClust$gene), subMod$symbol) %>% as.numeric()
list1 <- subClust[1:maxGenes, c("gene","FC")] %>% dplyr::rename(value=FC)
list2 <- subMod[1:maxGenes, c("symbol",modName)] %>% dplyr::rename(gene=symbol, value=modName)
RRHO_path <-file.path("RRHO_results",paste(paste("Cluster",clust,sep=""),"vs",modName,sep="_"))
dir.create(RRHO_path,recursive = T, showWarnings = F)
RRHO_results <- RRHO(list1=list1, list2=list2,
labels = c(paste("Cluster",clust,sep="_"), paste("Module",mod,sep="_")),
plots = T, alternative = "enrichment", outputdir = RRHO_path, BY=TRUE
)
lattice::levelplot(RRHO_results$hypermat)
# Pval testing
pval.testing <- pvalRRHO(RRHO_results, 50)
pval.testing$pval
xs<- seq(0, 10, length=100)
plot(Vectorize(pval.testing$FUN.ecdf)(xs)~xs, xlab='-log(pvalue)', ylab='ECDF', type='S')
lattice::levelplot(RRHO_results$hypermat.by)
}
}